# encoding: utf-8
# @Time    : 2025/3/3 20:20


from transformers import PretrainedConfig


class QwenConfig(PretrainedConfig):
    """
    Configuration for a small Qwen2-style causal language model.

    Exposes the same core fields as `transformers.Qwen2Config` (hidden size,
    attention/KV head counts, RoPE settings, RMSNorm epsilon, etc.), with
    defaults sized for a small model trained from scratch.
    """
    model_type = "qwen2"

    def __init__(self,
                 vocab_size=6400,
                 hidden_size=512,
                 intermediate_size=512*4,
                 num_hidden_layers=32,
                 num_attention_heads=8,
                 num_key_value_heads=8,
                 hidden_act="silu",
                 max_position_embeddings=32768,
                 rms_norm_eps=1e-6,
                 use_cache=True,
                 rope_scaling=None,
                 attention_dropout=0.0,
                 rope_theta=10000.0,
                 tie_word_embeddings=False,
                 **kwargs
                 ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
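        # When num_key_value_heads < num_attention_heads this enables grouped-query
        # attention (GQA); equal values (the default here) give standard multi-head attention.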
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # attention_dropout is exposed but defaults to 0.0, i.e. effectively no dropout.
        # LLM training is expensive, so dropout is usually left off in practice.
        self.attention_dropout = attention_dropout



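# Minimal usage sketch (an illustrative assumption, not part of the original file):
# it only shows instantiating the config and deriving the per-head dimension that
# the attention layers would use; the model code itself is not shown here.
if __name__ == "__main__":
    config = QwenConfig(
        vocab_size=6400,
        hidden_size=512,
        num_hidden_layers=8,        # smaller than the default 32, just for a quick check
        num_attention_heads=8,
        num_key_value_heads=8,
    )
    head_dim = config.hidden_size // config.num_attention_heads
    print(config.model_type)        # -> "qwen2"
    print(head_dim)                 # -> 64
    print(config.to_json_string())  # serialization inherited from PretrainedConfig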